# Data handling and numerics.
import pandas as pd
import numpy as np
# Plotting. ggplot, aes, geom_*, ggtitle, xlab, ylab and LetsPlot used in this
# document are not imported anywhere else, so they come from this wildcard import.
from lets_plot import *
from types import GeneratorType
# NOTE(review): pandas and numpy are imported a second time here; this is
# harmless but redundant.
import pandas as pd
import numpy as np
# scikit-learn: data splitting, candidate classifiers and evaluation metrics.
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

# Set up lets-plot's HTML output for this document (isolated_frame=True is
# passed through unchanged; see the lets-plot docs for its exact effect).
LetsPlot.setup_html(isolated_frame=True)
Tidy Data vs. ML-Ready Data
Import the datasets below. Spend some time getting to know the data.
What are the column names? How many rows?
Based on the task in Canvas, what is the Target?
What are the factors/features you think will be most predictive of the Target?
Show the code
# Load the tidy Denver dwellings data and its ML-ready counterpart from GitHub.
dwellings = pd.read_csv('https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_denver/dwellings_denver.csv')
dwellings_ml = pd.read_csv('https://github.com/byuidatascience/data4dwellings/raw/master/data-raw/dwellings_ml/dwellings_ml.csv')

# Shape and column labels of the tidy dataset.
rows, columns = dwellings.shape
column_names = dwellings.columns
print(rows)
print(columns)
print(column_names)
# The summary sentences are built from the data itself. The previous version
# retyped the counts and column names by hand, which can silently go stale
# (the hand-copied list had already lost a comma between 'gartype' and 'nocars').
print(f"there are {rows} rows and {columns} col in dwellings")
print("Col. names: " + ", ".join(f"'{name}'" for name in column_names))

# Same inspection for the ML-ready dataset.
rowsA, columnsB = dwellings_ml.shape
column_namesA = dwellings_ml.columns
print(rowsA)
print(columnsB)
print(column_namesA)
print(f"there are {rowsA} rows and {columnsB} col in dwellings_ml")
print("Col names in dwellings_ml: " + ", ".join(f"'{name}'" for name in column_namesA))

# Written answers to the assignment questions (target and expected predictors).
print("The target is to find homes built pre 1980")
print(
    "The factors that i think will be most helpful to finding out what houses were built "
    "before 1980 are: condition, arcstyle, netprice, condition_Fair and , condition_Good "
)
22913
26
Index(['parcel', 'nbhd', 'abstrprd', 'livearea', 'finbsmnt', 'basement',
'yrbuilt', 'condition', 'quality', 'totunits', 'stories', 'gartype',
'nocars', 'xtraffic', 'floorlvl', 'numbdrm', 'numbaths', 'arcstyle',
'sprice', 'deduct', 'netprice', 'tasp', 'smonth', 'syear', 'qualified',
'status'],
dtype='object')
there are 22913 rows and 26 col in dwellings
Col. names: 'parcel', 'nbhd', 'abstrprd', 'livearea', 'finbsmnt', 'basement', 'yrbuilt', 'condition', 'quality', 'totunits', 'stories', 'gartype' 'nocars', 'xtraffic', 'floorlvl', 'numbdrm', 'numbaths', 'arcstyle', 'sprice', 'deduct', 'netprice', 'tasp', 'smonth', 'syear', 'qualified',
'status'
22913
51
Index(['parcel', 'abstrprd', 'livearea', 'finbsmnt', 'basement', 'yrbuilt',
'totunits', 'stories', 'nocars', 'numbdrm', 'numbaths', 'sprice',
'deduct', 'netprice', 'tasp', 'smonth', 'syear', 'condition_AVG',
'condition_Excel', 'condition_Fair', 'condition_Good',
'condition_VGood', 'quality_A', 'quality_B', 'quality_C', 'quality_D',
'quality_X', 'gartype_Att', 'gartype_Att/Det', 'gartype_CP',
'gartype_Det', 'gartype_None', 'gartype_att/CP', 'gartype_det/CP',
'arcstyle_BI-LEVEL', 'arcstyle_CONVERSIONS', 'arcstyle_END UNIT',
'arcstyle_MIDDLE UNIT', 'arcstyle_ONE AND HALF-STORY',
'arcstyle_ONE-STORY', 'arcstyle_SPLIT LEVEL', 'arcstyle_THREE-STORY',
'arcstyle_TRI-LEVEL', 'arcstyle_TRI-LEVEL WITH BASEMENT',
'arcstyle_TWO AND HALF-STORY', 'arcstyle_TWO-STORY', 'qualified_Q',
'qualified_U', 'status_I', 'status_V', 'before1980'],
dtype='object')
there are 22913 rows and 51 col in dwellings_ml
Col names in dwellings_ml: 'parcel', 'abstrprd', 'livearea', 'finbsmnt', 'basement', 'yrbuilt',
'totunits', 'stories', 'nocars', 'numbdrm', 'numbaths', 'sprice',
'deduct', 'netprice', 'tasp', 'smonth', 'syear', 'condition_AVG',
'condition_Excel', 'condition_Fair', 'condition_Good',
'condition_VGood', 'quality_A', 'quality_B', 'quality_C', 'quality_D',
'quality_X', 'gartype_Att', 'gartype_Att/Det', 'gartype_CP',
'gartype_Det', 'gartype_None', 'gartype_att/CP', 'gartype_det/CP',
'arcstyle_BI-LEVEL', 'arcstyle_CONVERSIONS', 'arcstyle_END UNIT',
'arcstyle_MIDDLE UNIT', 'arcstyle_ONE AND HALF-STORY',
'arcstyle_ONE-STORY', 'arcstyle_SPLIT LEVEL', 'arcstyle_THREE-STORY',
'arcstyle_TRI-LEVEL', 'arcstyle_TRI-LEVEL WITH BASEMENT',
'arcstyle_TWO AND HALF-STORY', 'arcstyle_TWO-STORY', 'qualified_Q',
'qualified_U', 'status_I', 'status_V', 'before1980'
The target is to find homes built pre 1980
The factors that i think will be most helpful to finding out what houses were built before 1980 are: condition, arcstyle, netprice, condition_Fair and , condition_Good
Describe the main differences between the two datasets.
Exploration
The Target
Create appropriate numerical and graphical summaries of the target.
Show the code
# Histogram of construction year in the tidy dataset (bins are 10 years wide).
plotA = (
    ggplot(dwellings, aes(x='yrbuilt'))
    + geom_histogram(binwidth=10, fill='#0000FF', color='black')
    + ggtitle('Year Built in the dwellings data')
    + xlab('Year Built')
    + ylab('Number of Houses')
)

# Same histogram for the ML-ready dataset, drawn in red to tell the two apart.
plotB = (
    ggplot(dwellings_ml, aes(x='yrbuilt'))
    + geom_histogram(binwidth=10, fill='#FF0000', color='black')
    + ggtitle('Year Built in the dwellings_ml data')
    + xlab('Year Built')
    + ylab('Number of Houses')
)

display(plotA)
display(plotB)

# Numerical summary: mean construction year in each dataset.
average_valueA = dwellings['yrbuilt'].mean()
average_valueB = dwellings_ml['yrbuilt'].mean()
print(f"the average year of houses built for dwellings: {average_valueA:.0f}")
# Report the dwellings_ml mean here; the previous version printed
# average_valueA a second time and never used average_valueB.
print(f"the average year for houses built for dwellings_ml: {average_valueB:.0f}")
the average year of houses built for dwellings: 1964
the average year for houses built for dwellings_ml: 1964
Relationships
Make 3 graphs that visualize the relationship between the target and 3 features you think might predict the target.
Show the code
def _scatter(x_col, y_col, point_color, title, x_label, y_label):
    """Build a semi-transparent scatter plot of two columns of `dwellings`."""
    return (
        ggplot(dwellings, aes(x=x_col, y=y_col))
        + geom_point(color=point_color, alpha=0.6)
        + ggtitle(title)
        + xlab(x_label)
        + ylab(y_label)
    )


# Plot 1: architecture style against number of stories.
plot1 = _scatter(
    'arcstyle', 'stories', 'blue',
    'Style vs Stories', 'Architecture Style (number)', 'Stories',
)

# Plot 2: garage type against number of stories.
plot2 = _scatter(
    'gartype', 'stories', 'green',
    'Garage Type vs Stories', 'Garage Type (number)', 'Stories',
)

# Plot 3: quality against livable area.
plot3 = _scatter(
    'quality', 'livearea', 'orange',
    'Quality vs Livable Area', 'Quality', 'Livable Area (sq ft)',
)

# Render the three plots in order.
for _plot in (plot1, plot2, plot3):
    display(_plot)
Build a Model
Define X and y
Show the code
# Which variables will help narrow it down?
#
# The earlier list also named "quality", "condition" and "arcstyle". dwellings_ml
# has no columns with those exact names (only prefixed ones such as quality_A,
# condition_AVG, arcstyle_ONE-STORY), and DataFrame.filter silently ignores
# labels that are absent, so those three never reached the model. They are
# removed here so the list reflects what is actually used.
# NOTE(review): if the encoded quality_*/condition_*/arcstyle_* columns were
# meant to be features, add them to this list explicitly.
feature_columns = [
    "livearea",
    "stories",
    "basement",
    "condition_Fair",
    "nocars",
    "numbdrm",
    "netprice",
    "numbaths",
    "sprice",
    "qualified_Q",
    "deduct",
    "finbsmnt",
    "abstrprd",
]

# Index directly so a misspelled or missing column raises KeyError instead of
# being dropped without notice.
x = dwellings_ml[feature_columns]

# Target: was it built before 1980?
y = dwellings_ml['before1980']
# Hold out part of the data for evaluation. Nothing visible in this document
# defines x_train/x_test/y_train/y_test, so without this split the cell fails
# with NameError when the notebook is run top to bottom.
# NOTE(review): test_size and random_state are assumed values. If a split
# already exists in a cell not shown here, reconcile the two.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

# Create the classifier. This is gradient-boosted trees; the `_DT` suffix
# appears to be left over from an earlier DecisionTreeClassifier(max_depth=10)
# attempt and is kept so other cells that use the name still work.
classifier_DT = GradientBoostingClassifier(max_depth=10)

# Fit on the training portion.
classifier_DT.fit(x_train, y_train)

# Predict on the held-out portion.
y_predicted_DT = classifier_DT.predict(x_test)

# Evaluate: fraction of held-out homes classified correctly.
print("Accuracy:", metrics.accuracy_score(y_test, y_predicted_DT))
Accuracy: 0.9149029020292385
Work on the model until you can get >90% accuracy.
Try using different feature sets, different models, or different parameters within the models.